/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	pdb_full.dta
			pdb_full_entities.dta
			scooped_groups.dta

Outputs: 	pdb_full_structures.dta

Purpose: 	Generate a dataset unique at the structure level
			Generate new variables specific to this project

*******************************************************************************/

clear all
use "${data_built}pdb_full.dta", clear

* Manually input entity ID if missing (only occurs in cases w/ 1 entity)
assert numEntities == 1 if entityId == .
replace entityId = 1 if entityId == .
isid structureId entityId

* Fill in missing publication years if pub year is missing but PMID is present
replace publicationYear = year(releaseDate) if publicationYear == . 		///
	& pubmedId != . 

* Flag the relevant entity-level variables and keep these
local structureVars structureId numEntities structureTitle experimentalTech ///
	classification structureMolecularWt residueCount atomSiteCount 			///
	macroMoleculeType releaseDate depositionDate collectionDate 			///
	refinementResolution rObs rAll rWork rFree averageBFactor 				///
	determineMethod molecularReplacement firstStructureAuthor				/// 
	lastStructureAuthor numStructureAuthors rama* drugbank*

	keep pubmedId `structureVars' 
	duplicates drop // want data to be unique at structure level
	isid structureId
	order `structureVars' pubmedId
	sort structureId

* Generate new variables

	* Variable for structures in paper
	gen count = 1
	bys pubmedId: egen structuresInPaper = sum(count)
	replace structuresInPaper = . if pubmedId == .
	drop count
	
	* Variable for structures in project
	* (same first/last author, same release date)
	gen count = 1
	bys firstStructureAuthor lastStructureAuthor releaseDate: 				///
		egen structuresInProject = sum(count)
	drop count
	
	* Create a project ID (like a PubMed ID)
	egen double projectId = 												///
		group(firstStructureAuthor lastStructureAuthor releaseDate), missing
	egen temp = max(pubmedId)
	replace projectId = projectId + temp
	drop temp
	
	* Variable for structural genomics structures 
	* List of structural genomics from: 
	* https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4834271/table/T1/?report=objectonly
	replace lastStructureAuthor = lower(lastStructureAuthor)
	
	gen structuralGenomics = 												///
		strpos(lastStructureAuthor, "structural genomics") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "structural proteomics") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "enzyme") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "membrane") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "accelerated technologies center") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "cell-cell and cell-matrix adhesions") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "immune function network") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "bacterial targets") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "high-throughput") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "membrane proteins") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "bacterial targets") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "gpcr network") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "integrated center for structure") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "mitochondrial") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "nucleocytoplasmic") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "oxford protein production") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "nuclear receptor") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "partnership for") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "program for the characterization") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "protein structure factory") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "structure 2 function") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "toxin-immunity protein complexes") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "structures of mtb proteins") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "midwest center") > 0
	replace structuralGenomics = 1 if 										///
		strpos(lastStructureAuthor, "new york structural") > 0

	
	* Membrane protein indicator
	replace classification = lower(classification)
	gen membrane_protein = strpos(classification, "membrane") > 0
	
	* Log molecular weight
	gen lnStructureMolecularWt = ln(structureMolecularWt)
	gen lnStructureMolecularWt2 = lnStructureMolecularWt^2
	
	* Log residue count
	gen lnResidueCount = ln(residueCount)
	gen lnResidueCount2 = lnResidueCount^2
	
	* Log atom site count
	gen lnAtomSiteCount = ln(atomSiteCount)
	gen lnAtomSiteCount2 = lnAtomSiteCount^2
	
	* Standardized time outcomes
 	gen maturation = (depositionDate - collectionDate)/365
	replace maturation = 0 if maturation < 0
	lab var maturation "Maturation period"
	
	* Deposition year
	gen depositionYear = year(depositionDate)
	
	* Deposition year dummies
	forval y = 1993/2018 {
		gen deposition`y' = 0
		replace deposition`y' = 1 if depositionYear == `y'
	}
	
	* Create average cluster counts within structures 
	* (averaging over PRIORITY entities in structure)
	preserve
		use "${data_built}pdb_full_entities.dta", clear
		drop if priorityEntity == 0
		collapse (mean) clusterCount clusterCount1Yr clusterCount2Yr 		///
			clusterCountPost1Yr clusterCountPost2Yr clusterCountRace 		///
			clusterCountNonRace, by(structureId)
		tempfile count
		save `count'
	restore
	merge 1:1 structureId using `count'
	assert _merge != 2
	drop _merge
	
	gen lnClusterCount = ln(clusterCount)
	gen lnClusterCount1Yr = ln(clusterCount1Yr)
	gen lnClusterCount2Yr = ln(clusterCount2Yr)
	gen lnClusterCountRace = ln(clusterCountRace)
	
	* Create priority structure variable (if any entity is a priority entity)
	preserve
		use "${data_built}pdb_full_entities.dta", clear	
		bys structureId: egen priorityStructure = max(priorityEntity) 
	
		* Only keep priority structure and structure ID variables			
		* Reduce data to structure level
		keep structureId priorityStructure
		duplicates drop
		tempfile priority
		save `priority'
		
	restore
	merge 1:1 structureId using `priority'
	assert _merge == 3
	drop _merge
	
	* Create a variable for subsequent deposit
	gen subsequentDeposit = (clusterCountNonRace > 0)
	
	* Create race dummy outcomes
	preserve
		use "${data_raw}PDB/scooped_groups.dta", clear
		keep if sampleBlind == 1
		sort clusterId releaseDate
		duplicates drop structureId, force
		tempfile race
		save `race'
	restore
	merge 1:1 structureId using `race'
	assert _merge != 2
	gen priorityRace = (_merge == 3)
	drop _merge
	
	* Create a variable for best structure quality 
	* (if structure is deposited multiple times)	
	preserve
	
	* Merge on entities, assign each entity the quality of the structure
	merge 1:m structureId using "${data_built}pdb_full_entities.dta", 	///
		keepusing(clusterNum100)
	assert _merge == 3
	drop _merge
	drop if clusterNum100 == .
	keep if experimentalTechnique == "X-RAY DIFFRACTION" 
	
	* Compute the best quality for each entity
	bys clusterNum100: egen refinementResolutionBest = min(refinementResolution)
	bys clusterNum100: egen rFreeBest = min(rFree)
	bys clusterNum100: egen ramaRawBest = min(ramaRaw)
	
	* Collapse back down to the structure level
	collapse refinementResolutionBest rFreeBest ramaRawBest, by(structureId)
	
	* Save these best structures
	tempfile best_by_structure
	save `best_by_structure'
	
	* Merge them on to main data
	restore
	
	merge 1:1 structureId using `best_by_structure'
	assert _merge != 2
	drop _merge
	
	
save "${data_built}pdb_full_structures.dta", replace	
	
	
	
	
